/*
 * Copyright (c) 2006 Oracle.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/kernel.h>
#include <net/sock.h>
#include <linux/in.h>
#include <linux/list.h>

#include "rds.h"

/*
 * Forget any partially transmitted message and rewind the connection's
 * transmit and congestion map cursors back to zero.
 *
 * The caller must hold c_lock.  This must also be serialized with
 * sending, which uses the c_xmit_rm ref.
 */
void rds_send_reset(struct rds_connection *conn)
{
	struct rds_message *in_flight;

	assert_spin_locked(&conn->c_lock);

	/* give back the reference held on the message being transmitted */
	in_flight = conn->c_xmit_rm;
	if (in_flight != NULL) {
		rds_message_put(in_flight);
		conn->c_xmit_rm = NULL;
	}

	/* congestion map progress */
	conn->c_map_bytes = 0;
	conn->c_map_offset = 0;
	conn->c_map_queued = 0;

	/* message header and scatterlist progress */
	conn->c_xmit_data_off = 0;
	conn->c_xmit_hdr_off = 0;
	conn->c_xmit_sg = 0;
}

/*
 * Push queued congestion map updates and messages down the connection
 * through the transport's xmit hooks.
 *
 * We're making the conscious trade-off here to only send one message
 * down the connection at a time.
 *   Pro:
 *      - tx queueing is a simple fifo list
 *      - reassembly is optional and easily done by transports per conn
 *      - no per flow rx lookup at all, straight to the socket
 *      - less per-frag memory and wire overhead
 *   Con:
 *      - queued acks can be delayed behind large messages
 *   Depends:
 *      - small message latency is higher behind queued large messages
 *      - large message latency isn't starved by intervening small sends
 *
 * Returns 0 without doing any work if another task already holds
 * c_send_sem.  Otherwise returns whatever the last transport call left
 * in 'ret' when the loop stopped, replaced by -EAGAIN if we stopped on
 * an empty send queue with ret == 0 and then found a newly queued
 * message after dropping the semaphore.
 */
int rds_send_xmit(struct rds_connection *conn)
{
	struct rds_message *rm;
	unsigned long flags;
	unsigned int tmp;
	struct scatterlist *sg;
	int ret = 0;
	int was_empty = 0;

	/*
	 * sendmsg calls here after having queued its message on the send
	 * queue.  We only have one task feeding the connection at a time.  If
	 * another thread is already feeding the queue then we back off.  This
	 * avoids blocking the caller and trading per-connection data between
	 * caches per message.
	 *
	 * The sem holder will issue a retry if they notice that someone queued
	 * a message after they stopped walking the send queue but before they
	 * dropped the sem.
	 */
	if (down_trylock(&conn->c_send_sem)) {
		rds_stats_inc(s_send_sem_contention);
		goto out;
	}

	/* optional transport hook called before we start pushing data */
	if (conn->c_trans->xmit_prepare)
		conn->c_trans->xmit_prepare(conn);

	/*
	 * spin trying to push headers and data down the connection until
	 * the connection doesn't make forward progress.
	 */
	for(;;) {
		/*
		 * See if we need to send a congestion map update if we're
		 * between sending messages.  The send_sem protects our sole
		 * use of c_map_offset and _bytes.
		 */
		if (conn->c_xmit_hdr_off == 0 && conn->c_map_bytes) {
			ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong,
						conn->c_map_offset);
			if (ret <= 0)
				break;

			/* a positive return is the number of map bytes sent */
			conn->c_map_offset += ret;
			conn->c_map_bytes -= ret;
			if (conn->c_map_bytes)
				continue;
		}

		/*
		 * A map update was requested: arm a full header plus map
		 * transfer and go around again so the branch above sends it.
		 */
		if (test_and_clear_bit(0, &conn->c_map_queued)) {
			conn->c_map_offset = 0;
			conn->c_map_bytes = sizeof(struct rds_header) +
					    RDS_CONG_MAP_BYTES;
			continue;
		}

		spin_lock_irqsave(&conn->c_lock, flags);
		/*
		 * Move the message from the send queue to the retransmit
		 * list after we've sent it.  It might have been canceled
		 * or acked while we held our reference to it without holding
		 * the conn lock, in which case RDS_MSG_ON_CONN is clear and
		 * we only drop our reference.
		 */
		if (conn->c_xmit_rm &&
		    conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
		    conn->c_xmit_sg == conn->c_xmit_rm->m_nents) {
			rm = conn->c_xmit_rm;
			conn->c_xmit_rm = NULL;
			conn->c_xmit_sg = 0;
			conn->c_xmit_hdr_off = 0;
			conn->c_xmit_data_off = 0;

			if (test_bit(RDS_MSG_ON_CONN, &rm->m_flags))
				list_move_tail(&rm->m_conn_item,
					       &conn->c_retrans);
			rds_message_put(rm);
		}

		/*
		 * c_xmit_rm holds a ref while we're sending this message down
		 * the connection.  We can use this ref while holding the
		 * send_sem.. rds_send_reset() is serialized with it.
		 */
		if (conn->c_xmit_rm == NULL) {
			if (list_empty(&conn->c_send_queue)) {
				spin_unlock_irqrestore(&conn->c_lock, flags);
				was_empty = 1;
				break;
			}
			/* start on the message at the head of the send queue */
			rm = list_entry(conn->c_send_queue.next,
					struct rds_message,
					m_conn_item);
			rds_message_addref(rm);
			conn->c_xmit_rm = rm;
		} else
			rm = conn->c_xmit_rm;
		spin_unlock_irqrestore(&conn->c_lock, flags);

		if (conn->c_xmit_hdr_off < sizeof(struct rds_header) ||
		    conn->c_xmit_sg < rm->m_nents) {
			ret = conn->c_trans->xmit(conn, rm,
						  conn->c_xmit_hdr_off,
						  conn->c_xmit_sg,
						  conn->c_xmit_data_off);
			if (ret <= 0)
				break;

			/*
			 * A positive return is treated as a byte count.
			 * Charge it to the header first ...
			 */
			if (conn->c_xmit_hdr_off < sizeof(struct rds_header)) {
				tmp = min_t(int, ret,
					    sizeof(struct rds_header) -
					    conn->c_xmit_hdr_off);
				conn->c_xmit_hdr_off += tmp;
				ret -= tmp;
			}

			/*
			 * ... and then advance through the scatterlist
			 * entries with whatever is left over.
			 */
			sg = &rm->m_sg[conn->c_xmit_sg];
			while (ret) {
				tmp = min_t(int, ret, sg->length -
						      conn->c_xmit_data_off);
				conn->c_xmit_data_off += tmp;
				ret -= tmp;
				if (conn->c_xmit_data_off == sg->length) {
					conn->c_xmit_data_off = 0;
					sg++;
					conn->c_xmit_sg++;
					/* bytes left over past the last entry */
					BUG_ON(ret != 0 &&
					       conn->c_xmit_sg == rm->m_nents);
				}
			}
		}
	}

	/* optional transport hook called once we've stopped pushing data */
	if (conn->c_trans->xmit_complete)
		conn->c_trans->xmit_complete(conn);

	/*
	 * We might be racing with another sender who queued a message but
	 * backed off on noticing that we held the c_send_sem.  If we check
	 * for queued messages after dropping the sem then either we'll
	 * see the queued message or the queuer will get the sem.  If we
	 * notice the queued message then we trigger an immediate retry.
	 *
	 * We need to be careful only to do this when we stopped processing
	 * the send queue because it was empty.  It's the only way we
	 * stop processing the loop when the transport hasn't taken
	 * responsibility for forward progress.
	 */
	up(&conn->c_send_sem);

	if (ret == 0 && was_empty) {
		spin_lock_irqsave(&conn->c_lock, flags);
		if (!list_empty(&conn->c_send_queue)) {
			rds_stats_inc(s_send_sem_queue_raced);
			ret = -EAGAIN;
		}
		spin_unlock_irqrestore(&conn->c_lock, flags);
	}
out:
	return ret;
}

/*
 * Give a message's payload length, as recorded in its header, back to
 * the socket's send buffer accounting.  The caller must hold rs_lock.
 */
static void rds_send_sndbuf_remove(struct rds_sock *rs, struct rds_message *rm)
{
	u32 payload = be32_to_cpu(rm->m_inc.i_hdr.h_len);

	assert_spin_locked(&rs->rs_lock);

	/* we can never release more than was accounted */
	BUG_ON(payload > rs->rs_snd_bytes);
	rs->rs_snd_bytes -= payload;

	if (!rs->rs_snd_bytes)
		rds_stats_inc(s_send_queue_empty);
}

/*
 * Decide whether 'ack' covers the given message.  A caller-supplied
 * is_acked callback gets the final say; without one we compare the
 * sequence number in the message header against 'ack'.
 */
static inline int rds_send_is_acked(struct rds_message *rm, u64 ack,
				    is_acked_func is_acked)
{
	u64 seq;

	if (is_acked != NULL)
		return is_acked(rm, ack);

	seq = be64_to_cpu(rm->m_inc.i_hdr.h_sequence);
	return seq <= ack;
}

/*
 * Called by transports once they know the receiver has queued every
 * message up to, and including, the given sequence number.
 *
 * A message stays at the head of the send queue while the transport
 * works on it, and we might be racing with the sender reacquiring the
 * lock and moving it to the retransmit queue.  That is why the send
 * queue has to be checked as well as the retransmit list.
 *
 * XXX It's not clear to me how this is safely serialized with socket
 * destruction.  Maybe it should bail if it sees SOCK_DEAD.
 */
void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
			 is_acked_func is_acked)
{
	struct rds_message *msg, *next;
	struct rds_sock *rs = NULL;
	unsigned long flags;
	LIST_HEAD(acked);
	int wake = 0;

	/* first pass: unhook every acked message from the connection */
	spin_lock_irqsave(&conn->c_lock, flags);

	list_for_each_entry_safe(msg, next, &conn->c_retrans, m_conn_item) {
		if (!rds_send_is_acked(msg, ack, is_acked))
			break;

		list_move(&msg->m_conn_item, &acked);
		clear_bit(RDS_MSG_ON_CONN, &msg->m_flags);
		wake = 1;
	}

	/*
	 * The send queue is only looked at when m_ack_seq is not in use.
	 * m_ack_seq only becomes valid once ->xmit has set it and the
	 * message has been moved to the retrans list.
	 */
	if (!is_acked) {
		list_for_each_entry_safe(msg, next, &conn->c_send_queue,
					 m_conn_item) {
			if (!rds_send_is_acked(msg, ack, is_acked))
				break;

			list_move(&msg->m_conn_item, &acked);
			clear_bit(RDS_MSG_ON_CONN, &msg->m_flags);
			wake = 1;
		}
	}

	/* order flag updates with spin locks */
	if (wake)
		smp_mb__after_clear_bit();

	spin_unlock_irqrestore(&conn->c_lock, flags);

	/* second pass: take the messages off their sockets as needed */
	while (!list_empty(&acked)) {
		msg = list_entry(acked.next, struct rds_message, m_conn_item);
		list_del_init(&msg->m_conn_item);

		rds_stats_inc(s_send_drop_acked);

		/*
		 * A cleared flag here means for *sure* that someone else
		 * already took the message off the sock.  If we instead
		 * race with their flag update, we will get the lock and
		 * then really see the cleared flag below.
		 */
		if (!test_bit(RDS_MSG_ON_SOCK, &msg->m_flags)) {
			rds_message_put(msg);
			continue;
		}

		/* hold one sock lock at a time, switching only on change */
		if (msg->m_rs != rs) {
			if (rs != NULL) {
				spin_unlock_irqrestore(&rs->rs_lock, flags);
				if (wake)
					rds_wake_sk_sleep(rs);
			}
			rs = msg->m_rs;
			spin_lock_irqsave(&rs->rs_lock, flags);
		}

		if (test_and_clear_bit(RDS_MSG_ON_SOCK, &msg->m_flags)) {
			list_del_init(&msg->m_sock_item);
			rds_send_sndbuf_remove(rs, msg);
			/* the sock list's reference */
			rds_message_put(msg);
		}

		/* the conn list's reference */
		rds_message_put(msg);
	}

	if (rs != NULL) {
		spin_unlock_irqrestore(&rs->rs_lock, flags);
		if (wake)
			rds_wake_sk_sleep(rs);
	}
}

void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
{
	struct rds_message *rm, *tmp;
	struct rds_connection *conn;
	unsigned long flags;
	LIST_HEAD(list);
	int wake = 0;

	/* get all the messages we're dropping under the rs lock */
	spin_lock_irqsave(&rs->rs_lock, flags);

	list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) {
		if (dest && (dest->sin_addr.s_addr != rm->m_daddr ||
			     dest->sin_port != rm->m_inc.i_hdr.h_dport))
			continue;

		wake = 1;
		list_move(&rm->m_sock_item, &list);
		rds_send_sndbuf_remove(rs, rm);
		clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
	}

	/* order flag updates with the rs lock */
	if (wake)
		smp_mb__after_clear_bit();

	spin_unlock_irqrestore(&rs->rs_lock, flags);

	if (wake)
		rds_wake_sk_sleep(rs);

	conn = NULL;

	/* now remove the messages from the conn list as needed */
	while (!list_empty(&list)) {
		rm = list_entry(list.next, struct rds_message, m_sock_item);
		list_del_init(&rm->m_sock_item);

		/*
		 * If we see this flag cleared then we're *sure* that someone
		 * else beat us to removing it from the conn.  If we race
		 * with their flag update we'll get the lock and then really
		 * see that the flag has been cleared.
		 */
		if (!test_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
			rds_message_put(rm);
			continue;
		}

		if (conn != rm->m_inc.i_conn) {
			if (conn)
				spin_unlock_irqrestore(&conn->c_lock, flags);
			conn = rm->m_inc.i_conn;
			spin_lock_irqsave(&conn->c_lock, flags);
		}

		if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
			list_del_init(&rm->m_conn_item);
			rds_message_put(rm);
		}

		rds_message_put(rm);
	}

	if (conn)
		spin_unlock_irqrestore(&conn->c_lock, flags);
}

/*
 * Try to put 'rm' on both the socket's and the connection's send queue,
 * provided it still fits in the socket's send buffer.
 *
 * This must only fire once per message, which is why the caller's
 * 'queued' flag is used to remember success across repeated calls.  It's
 * possible that another thread can race with us and remove the message
 * from the flow with RDS_CANCEL_SENT_TO.
 *
 * Returns the value of *queued: non-zero once the message is queued.
 */
static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn,
			     struct rds_message *rm, __be16 sport,
			     __be16 dport, int *queued)
{
	unsigned long flags;
	u32 len, total;

	/* an earlier call already queued it */
	if (*queued)
		return *queued;

	len = be32_to_cpu(rm->m_inc.i_hdr.h_len);

	/* this is the only place which holds both locks */
	spin_lock_irqsave(&rs->rs_lock, flags);
	spin_lock(&conn->c_lock);

	/* refuse on u32 wrap-around or when the send buffer would overflow */
	total = rs->rs_snd_bytes + len;
	if (total < rs->rs_snd_bytes || total > rds_sk_sndbuf(rs))
		goto unlock;

	rs->rs_snd_bytes = total;

	/* socket side: one reference for the sock list */
	list_add_tail(&rm->m_sock_item, &rs->rs_send_queue);
	set_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
	rds_message_addref(rm);
	rm->m_rs = rs;

	/* connection side: one reference for the conn list */
	list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
	set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
	rds_message_addref(rm);
	rm->m_inc.i_conn = conn;

	/* sequence numbers are handed out under c_lock */
	rds_message_populate_header(rm, sport, dport, conn->c_next_tx_seq);
	conn->c_next_tx_seq++;

	rdsdebug("queued msg %p len %d, rs %p bytes %d seq %llu\n",
		 rm, len, rs, rs->rs_snd_bytes,
		 (unsigned long long)be64_to_cpu(rm->m_inc.i_hdr.h_sequence));

	*queued = 1;

unlock:
	spin_unlock(&conn->c_lock);
	spin_unlock_irqrestore(&rs->rs_lock, flags);
	return *queued;
}

/*
 * sendmsg() entry point: copy the payload from userspace into a new
 * message, queue it on the socket and on the connection to the
 * destination, and kick the send path.
 *
 * The destination comes from msg_name when one is supplied, otherwise
 * from the address recorded on the socket (rs_conn_addr/rs_conn_port).
 *
 * When the socket's send buffer is full a blocking caller sleeps for up
 * to the socket's *send* timeout waiting for room.
 *
 * Returns payload_len on success or a negative errno:
 *   -EOPNOTSUPP  msg_flags contains something other than MSG_DONTWAIT
 *                or MSG_CMSG_COMPAT
 *   -EINVAL      msg_name is too short, not AF_INET, or has port 0
 *   -ENOTCONN    no destination address, or the socket is not bound
 *   -EMSGSIZE    payload_len is larger than the send buffer
 *   -EAGAIN      send buffer full and the call is non-blocking
 *   -ETIMEDOUT   send buffer still full when the send timeout expired
 * as well as whatever rds_message_copy_from_user(), rds_conn_create(),
 * rds_cong_wait() or an interrupted wait return.
 */
int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
		size_t payload_len)
{
	struct sock *sk = sock->sk;
	struct rds_sock *rs = rds_sk_to_rs(sk);
	struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name;
	__be32 daddr;
	__be16 dport;
	struct rds_message *rm = NULL;
	struct rds_connection *conn;
	int ret = 0;
	int queued = 0;
	int nonblock = msg->msg_flags & MSG_DONTWAIT;
	/*
	 * This is the send path, so honour the send timeout (SO_SNDTIMEO)
	 * rather than the receive timeout.
	 */
	long timeo = sock_sndtimeo(sk, nonblock);

	/* Mirror Linux UDP mirror of BSD error message compatibility */
	/* XXX: Perhaps MSG_MORE someday */
	if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT)) {
		printk("msg_flags 0x%08X\n", msg->msg_flags);
		ret = -EOPNOTSUPP;
		goto out;
	}

	if (msg->msg_namelen) {
		/* XXX fail non-unicast destination IPs? */
		if (msg->msg_namelen < sizeof(*usin) || usin->sin_family != AF_INET ||
		    usin->sin_port == 0) {
			ret = -EINVAL;
			goto out;
		}
		daddr = usin->sin_addr.s_addr;
		dport = usin->sin_port;
	} else {
		/* We only care about consistency with ->connect() */
		lock_sock(sk);
		daddr = rs->rs_conn_addr;
		dport = rs->rs_conn_port;
		release_sock(sk);
	}

	/* racing with another thread binding seems ok here */
	if (daddr == 0 || rs->rs_bound_addr == 0) {
		ret = -ENOTCONN; /* XXX not a great errno */
		goto out;
	}

	rm = rds_message_copy_from_user(msg->msg_iov, payload_len);
	if (IS_ERR(rm)) {
		ret = PTR_ERR(rm);
		/* nothing to put at 'out' */
		rm = NULL;
		goto out;
	}

	rm->m_daddr = daddr;

	conn = rds_conn_create(rs->rs_bound_addr, daddr,  rs->rs_transport,
			       sock->sk->sk_allocation);
	if (IS_ERR(conn)) {
		ret = PTR_ERR(conn);
		goto out;
	}

	/* kick off a connection attempt if one isn't already under way */
	if (!test_bit(RDS_CONN_CONNECTING, &conn->c_status) &&
	    !test_bit(RDS_CONN_CONNECTED, &conn->c_status))
		queue_delayed_work(rds_wq, &conn->c_conn_w, 0);

	ret = rds_cong_wait(conn->c_fcong, dport, nonblock);
	if (ret)
		goto out;

	/*
	 * rds_send_queue_rm() only queues the message once; 'queued'
	 * remembers that across the loop and the wait condition below.
	 */
	while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port,
				  dport, &queued)) {
		rds_stats_inc(s_send_queue_full);
		/* XXX make sure this is reasonable */
		if (payload_len > rds_sk_sndbuf(rs)) {
			ret = -EMSGSIZE;
			goto out;
		}
		if (nonblock) {
			ret = -EAGAIN;
			goto out;
		}

		timeo = wait_event_interruptible_timeout(*sk->sk_sleep,
					rds_send_queue_rm(rs, conn, rm,
							  rs->rs_bound_port,
							  dport,
							  &queued),
					timeo);
		rdsdebug("sendmsg woke queued %d timeo %ld\n", queued, timeo);
		if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
			continue;

		/* 0 means the timeout expired, negative means interrupted */
		ret = timeo;
		if (ret == 0)
			ret = -ETIMEDOUT;
		goto out;
	}

	/*
	 * By now we've committed to the send.  We reuse rds_send_worker()
	 * to retry sends in the rds thread if the transport asks us to.
	 */
	rds_send_worker(&conn->c_send_w.work);
	ret = payload_len;

out:
	/* drop the reference we got from rds_message_copy_from_user() */
	if (rm)
		rds_message_put(rm);
	return ret;
}
